# Replication code
# Note: Cos


# Step 1: read the raw documents and cache them as an RDS file

library(readtext)
# Read every document found under the "Proposte di legge" path into a single
# data frame.
corpus.df <- readtext::readtext("Proposte di legge")

# Cache the raw corpus on disk; the following step reloads it from this file.
saveRDS(object = corpus.df, file = "corpus_raw.rds")

# Step 2: clean the texts and split them into sentences

library(tidyverse)
library(tidytext)
library(stopwords)

# Reload the cached raw corpus and drop every document whose file name
# contains "rouss" (case-insensitive).
corpus.df <-
  readRDS("corpus_raw.rds") %>%
  dplyr::filter(!grepl("rouss", doc_id, ignore.case = TRUE))

## pdf

# Keep only the documents whose file name contains "pdf" (case-insensitive).
corpus_pdf.df <-
  corpus.df %>%
  dplyr::filter(grepl("pdf", doc_id, ignore.case = TRUE))

# Re-join words that were split by a dash at the end of a line.
corpus_pdf.df$text <-
  gsub("(–|-|—)\\n", "", corpus_pdf.df$text)

# End position of the last "DISEGNO DI LEGGE" / "PROPOSTA DI LEGGE" heading in
# each document. When the heading is missing the position is 0, so the whole
# text is kept (instead of `max()` over an empty match set yielding -Inf).
corpus_pdf.df$where <-
  vapply(
    stringr::str_locate_all(corpus_pdf.df$text,
                            pattern = regex("(DISEGNO|PROPOSTA) DI LEGGE")),
    function(hits) if (nrow(hits) > 0) as.numeric(max(hits)) else 0,
    numeric(1)
  )

if (any(corpus_pdf.df$where == 0)) {
  warning(
    "No 'DISEGNO/PROPOSTA DI LEGGE' heading found, full text kept for: ",
    paste(corpus_pdf.df$doc_id[corpus_pdf.df$where == 0], collapse = ", "),
    call. = FALSE
  )
}

# Keep only what follows the heading.
corpus_pdf.df$text_clean <-
  stringr::str_sub(corpus_pdf.df$text, corpus_pdf.df$where + 1)

# Collapse every run of whitespace (newlines included) into a single space.
corpus_pdf.df$text_clean <-
  gsub("\\s+", " ", corpus_pdf.df$text_clean)

# Page headers/footers and other boilerplate to strip, applied in this order.
# NOTE(review): the last pattern, "\\*(.*)\\*", is greedy. Whitespace has
# already been collapsed above, so no newlines remain and a single match can
# span from the first to the last asterisk of a document -- confirm that this
# is intended.
pdf_boilerplate <- c(
  "Atti (p|P)arlamentari (–|-|—) [0-9]+ (–|-|—) Senato della Repubblica (–|-|—) N. [0-9]+",
  "Atti (p|P)arlamentari (–|-|—) [0-9]+ (–|-|—) Camera dei Deputati( — [0-9]+)?",
  "XVII(I)? LEGISLATURA (–|-|—) DISEGNI DI LEGGE E RELAZIONI (–|-|—) DOCUMENTI",
  "Atti Parlamentari (–|-|—) [0-9]+ (–|-|—) Camera dei Deputati",
  "XVII(I)? LEGISLATURA A.C. [0-9]+",
  "PAGINA BIANCA",
  "\\*(.*)\\*"
)

for (boilerplate in pdf_boilerplate) {
  corpus_pdf.df$text_clean <-
    str_remove_all(corpus_pdf.df$text_clean,
                   pattern = regex(boilerplate))
}

# sample(corpus_pdf.df$text_clean, 1)

# Split the cleaned PDF texts into sentences, one row per sentence.
corpus_pdf.tidy <-
  corpus_pdf.df %>%
  dplyr::select(-text, -where) %>%
  unnest_sentences(sentence, text_clean) %>%
  # drop sentences without any lowercase letter
  dplyr::filter(stringr::str_count(sentence, "[a-z]") != 0) %>%
  # re-join words split by a dash followed by a space
  dplyr::mutate(sentence = gsub("([a-z])(–|-|—) ", "\\1", sentence)) %>%
  dplyr::filter(!grepl("^(__ )?art.$", sentence)) %>%
  # keep sentences with more than three whitespace-separated words
  dplyr::filter(str_count(sentence, "\\S+") > 3) %>%
  # number the sentences within each document; row_number() is also safe when
  # no rows are left, unlike 1:n()
  dplyr::group_by(doc_id) %>%
  dplyr::mutate(sentence_n = dplyr::row_number()) %>%
  dplyr::ungroup() %>%
  # keep the original file name and strip only the trailing ".pdf" extension
  # (case-insensitive, matching the file filter) from the document id
  dplyr::mutate(
    file = doc_id,
    doc_id = sub("\\.pdf$", "", doc_id, ignore.case = TRUE)
  ) %>%
  dplyr::select(doc_id, sentence_n, sentence, file)

write.csv(corpus_pdf.tidy, file = "corpus_pdf.tidy.csv", row.names = FALSE)

# Share of sentences longer than 512 whitespace-separated words.
sum(str_count(corpus_pdf.tidy$sentence, "\\S+") > 512) /
  nrow(corpus_pdf.tidy)

## docx

# Keep only the documents whose file name contains "docx" (case-insensitive).
corpus_docx.df <-
  corpus.df %>%
  dplyr::filter(grepl("docx", doc_id, ignore.case = TRUE))

# Candidate patterns for the heading of article 1, in order of preference.
# Each one fills the column it is named after with the start position of its
# first match (NA when the pattern does not occur in the document).
docx_article_one <- c(
  where1 = "\\n(Art.|Articolo|ART.|ARTICOLO)([ ]+)?1",
  where2 = ". (Art.|Articolo|ART.|ARTICOLO)([ ]+)?1",
  where3 = "^(Art.|Articolo|ART.|ARTICOLO)([ ]+)?1",
  where4 = "(Art.|Articolo|ART.|ARTICOLO)([ ]+)?1"
)

for (where_col in names(docx_article_one)) {
  corpus_docx.df[[where_col]] <-
    stringr::str_locate(
      corpus_docx.df$text,
      pattern = regex(docx_article_one[[where_col]])
    )[, "start"]
}

# First available position among the four candidates; 1 when none matched.
corpus_docx.df$where <-
  dplyr::coalesce(
    corpus_docx.df$where1,
    corpus_docx.df$where2,
    corpus_docx.df$where3,
    corpus_docx.df$where4,
    1L
  )

# Keep the text from the character right after `where`.
# NOTE(review): `where` is the START of the match, so `+ 1` skips the first
# matched character (and the first character of the document when nothing
# matched) -- confirm that this is intended.
corpus_docx.df$text_clean <-
  stringr::str_sub(corpus_docx.df$text, corpus_docx.df$where + 1)

# Split the cleaned docx texts into sentences, one row per sentence.
corpus_docx.tidy <-
  corpus_docx.df %>%
  dplyr::select(-text, -where1, -where2, -where3, -where4, -where) %>%
  tidytext::unnest_sentences(sentence, text_clean) %>%
  # drop sentences without any lowercase letter
  dplyr::filter(stringr::str_count(sentence, "[a-z]") != 0) %>%
  dplyr::filter(!grepl("^(__ )?art.$", sentence)) %>%
  # keep sentences with more than three whitespace-separated words
  dplyr::filter(str_count(sentence, "\\S+") > 3) %>%
  dplyr::mutate(sentence = stringr::str_squish(sentence)) %>%
  # number the sentences within each document; row_number() is also safe when
  # no rows are left, unlike 1:n()
  dplyr::group_by(doc_id) %>%
  dplyr::mutate(sentence_n = dplyr::row_number()) %>%
  dplyr::ungroup() %>%
  # keep the original file name and strip only the trailing ".docx" extension
  # (case-insensitive, matching the file filter) from the document id
  dplyr::mutate(
    file = doc_id,
    doc_id = sub("\\.docx$", "", doc_id, ignore.case = TRUE)
  ) %>%
  dplyr::select(doc_id, sentence_n, sentence, file)

write.csv(corpus_docx.tidy, file = "corpus_docx.tidy.csv", row.names = FALSE)

# Share of sentences longer than 512 whitespace-separated words.
sum(str_count(corpus_docx.tidy$sentence, "\\S+") > 512) /
  nrow(corpus_docx.tidy)

# Optional Step (Cosine similarity by word was not included in the analysis)

# Word counts per PDF document, with Italian stop words removed and tf-idf
# weights attached.
corpus_pdf.tidy_token <-
  corpus_pdf.df %>%
  dplyr::select(-text, -where) %>%
  tidytext::unnest_tokens(word, text_clean) %>%
  # drop tokens without any lowercase letter
  dplyr::filter(stringr::str_count(word, "[a-z]") != 0) %>%
  dplyr::mutate(word = gsub("([a-z])(–|-|—) ", "\\1", word)) %>%
  # NOTE(review): "." is a regex wildcard here, so this also drops any
  # four-letter token starting with "art" -- confirm that this is intended.
  dplyr::filter(!grepl("^(__ )?art.$", word)) %>%
  count(doc_id, word) %>%
  dplyr::filter(!word %in% stopwords::stopwords("it", source = "snowball")) %>%
  # keep the original file name and strip only the trailing ".pdf" extension
  # (case-insensitive, matching the file filter) from the document id
  dplyr::mutate(
    file = doc_id,
    doc_id = sub("\\.pdf$", "", doc_id, ignore.case = TRUE)
  ) %>%
  dplyr::select(doc_id, word, n, file) %>%
  bind_tf_idf(word, doc_id, n)

write.csv(corpus_pdf.tidy_token, file = "corpus_pdf.tidy_token.csv",
          row.names = FALSE)

# Word counts per docx document, with Italian stop words removed and tf-idf
# weights attached.
corpus_docx.tidy_token <-
  corpus_docx.df %>%
  dplyr::select(-text, -where) %>%
  tidytext::unnest_tokens(word, text_clean) %>%
  # drop tokens without any lowercase letter
  dplyr::filter(stringr::str_count(word, "[a-z]") != 0) %>%
  dplyr::mutate(word = gsub("([a-z])(–|-|—) ", "\\1", word)) %>%
  # NOTE(review): "." is a regex wildcard here, so this also drops any
  # four-letter token starting with "art" -- confirm that this is intended.
  dplyr::filter(!grepl("^(__ )?art.$", word)) %>%
  count(doc_id, word) %>%
  dplyr::filter(!word %in% stopwords::stopwords("it", source = "snowball")) %>%
  # keep the original file name and strip only the trailing ".docx" extension
  # (case-insensitive, matching the file filter) from the document id
  dplyr::mutate(
    file = doc_id,
    doc_id = sub("\\.docx$", "", doc_id, ignore.case = TRUE)
  ) %>%
  dplyr::select(doc_id, word, n, file) %>%
  bind_tf_idf(word, doc_id, n)

write.csv(corpus_docx.tidy_token, file = "corpus_docx.tidy_token.csv",
          row.names = FALSE)

library(widyr)

# For every docx document id, stack its token table with the token table of
# the PDF sharing the same id and compute the tf-idf similarity between the
# two files. The per-document results are collected in a list and bound once,
# instead of growing a data frame inside a loop.
cos_sim_results.token <-
  lapply(
    unique(corpus_docx.tidy_token$doc_id),
    function(this_doc_id) {
      paired_tokens <-
        bind_rows(
          corpus_docx.tidy_token %>%
            dplyr::filter(doc_id == this_doc_id),
          corpus_pdf.tidy_token %>%
            dplyr::filter(doc_id == this_doc_id)
        )

      pair_similarity <-
        paired_tokens %>%
        pairwise_similarity(file, word, tf_idf)

      # Only the first row is kept for each document id.
      # NOTE(review): when a docx has no PDF with the same id this presumably
      # yields an all-NA row that does not identify the document -- verify.
      pair_similarity[1, ]
    }
  ) %>%
  bind_rows() %>%
  as.data.frame()

write.csv(cos_sim_results.token, file = "cos_sim_results.token.csv",
          row.names = FALSE)
